library(tidyverse)
library(lubridate)
library(hms)
library(quanteda)
library(quanteda.textstats)
library(quanteda.textplots)
library(igraph)

# Read in indo_dt df
indo_dt <- readRDS(file = "/home/ubuntu/data/shared_folder/Data/20220729_indo_dt.rds")

# Time of day plot irrespective of date tweet posted shows two vertical bands. The plot is faceted by otsus to demonstrate that filtering out tweets mentioning special autonomy causes the vertical bands to disappear (the same can be done by filtering out tweets with zero engagement). Tweets are coloured by whether they mention otsus or not.

ggplot(indo_dt, aes(times, author_username, color = otsus)) +
  geom_point(shape=".", alpha = 0.05, show.legend = FALSE) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 90, vjust=0.5), axis.text.y = element_blank(), axis.ticks = element_blank()) +
  scale_x_time(breaks = hms(hours = seq(0, 24, 1)), expand = c(0, 0)) +
  scale_color_manual(values=c("#FFC20A", "#0C7BDC")) +
  labs(x = "Time of day, irrespective of date", y = "Distinct Twitter users")

ggsave(file = "figures/1_special_autonomy_tod.png", width=2400, height=2000, units="px")

# Create a data object that captures tweets sent in the minute starting 6:55:00am Jakarta time or the minute starting 8:00:00am Jakarta time, the two minutes with the highest frequency of posting, corresponding to the visible vertical bands in Figure 1 

vertical_bands <- indo_dt %>% 
  filter((as_hms("06:54:59") < times & times < hms::as_hms("06:56:00")) | (hms::as_hms("07:59:59") < times & times < hms::as_hms("08:01:00")))

# Confirmation that these two minutes are the highest concentrations of tweets, the next highest are adjacent
tweets_per_minute <- indo_dt %>%
  mutate(rounded_minutes = floor_date(jkt_date_time, unit = "minutes", week_start = getOption("lubridate.week.start", 7)), rounded_times = as_hms(rounded_minutes)) %>%
  group_by(otsus) %>%
  count(rounded_times, sort = TRUE) %>%
  mutate(zscore = (n - mean(n))/sd(n), percentile = percent_rank(n), prop = n/sum(n))

# First, some summary statistics for these vertical bands
# 3811 distinct authors
vertical_bands %>% distinct(author_id, .keep_all = TRUE) %>% nrow()

# 22589 of 23853 tweets in vertical bands are otsus, 22479 are original
vertical_bands %>% filter(otsus) %>% nrow
vertical_bands %>% filter(otsus & is_original) %>% nrow
vertical_bands %>% nrow()
indo_dt %>% filter(otsus & is_original) %>% nrow ##only 155388 original otsus tweets in total

# The first major analytical task for the vertical bands is to demonstrate that the tweets are duplicates or near duplicates of a much smaller set of distinct tweets. But the URLs generated by Twitter confound tests for duplicates or similarity and so the first step is to create a new clean_text with the URLs removed.

twitter_links <- "(https?)(://t.co/)(.*?)(?=\\\\n|\\s|'|$)"
vertical_bands_links_removed <- vertical_bands %>% mutate(clean_text = str_remove_all(text, twitter_links))

# We'll now use the Quanteda package/sub-packages to generate a similarity measure for each  possible pairing of the 23853 tweets in the vertical bands, using the clean_text column. 
# This approach is preferred to count or duplicated because requiring exact correspondence misses many tweets that are slightly paraphrased but to a human eye are clearly replicas or near replicas.
# We'll also use igraph to generate component numbers [ie sets of distinct tweets and their duplicates] that can be assigned to each tweet in the vertical bands.
# Some of these steps are quite time-consuming and memory-intensive.

# First, create a data object with just the details you need for the quanteda similarity analysis: the unique tweet identifiers, and the tweet text with the twitter-generated URLs removed
# rownames are set as the tweet ids
vblr_quanteda <- vertical_bands_links_removed %>% select(id, clean_text)
rownames(vblr_quanteda) <- vblr_quanteda$id

# Pull out tweet ids as a separate object to use further through analysis to name rows and columns
vblr_quanteda_ids <- vblr_quanteda$id

# Use quanteda dfm function to remove punctuation and create the tokenised quanteda data object that its textstat_simil function needs as input
my_dfm <- dfm(vblr_quanteda$clean_text, remove_punct = TRUE)

# use quanteda.textstats package function textstat_simil to conduct a jaccard similarity test
# pairwise on all 23853 tweets
# warning: this step is time-consuming
dfm_simil <- textstat_simil(my_dfm, margin = "documents", method = "jaccard")

# convert to a dataframe and then into tidy format using pivot_longer with one pairwise comparison per row - ie 23853x23853 rows
# warning: the pivot_longer step is time consuming
dfm_simil_df <- as.data.frame(as.matrix(dfm_simil))
colnames(dfm_simil_df) <- vblr_quanteda$id
dfm_simil_df_with_id <- cbind(vblr_quanteda_ids, dfm_simil_df)
longer_dfm_simil_df <- dfm_simil_df_with_id %>% pivot_longer(!vblr_quanteda_ids, names_to = "id_tweet_2", values_to = "similarity")
longer_dfm_simil_df <- rename(longer_dfm_simil_df, id_tweet_1 = vblr_quanteda_ids)
head(longer_dfm_simil_df)

# As the steps so far are time-consuming and memory-intensive, save dfm_simil_df and longer_dfm_simil_df as Rds to enable restart post-  textstat_simil stage
# NB: You may prefer to use the filtered version of longer_dfm_simil_df - created in the next stage - to restart as longer_dfm_simil_df is very large.

saveRDS(dfm_simil_df, file = "/home/ubuntu/data/shared_folder/Data/20220719_dfm_simil_df.rds")
saveRDS(longer_dfm_simil_df, file = "/home/ubuntu/data/shared_folder/Data/20220719_longer_dfm_simil_df.rds")

# Filter longer_dfm_simil_df to similarity scores better than or equal to 0.69 and save as Rds file for restart purposes

longer_dfm_simil_df_0.69 <- longer_dfm_simil_df %>% filter(similarity >= 0.69)
saveRDS(longer_dfm_simil_df_0.69, file = "/home/ubuntu/data/shared_folder/Data/20220719_longer_dfm_simil_df_069.rds")

# If you restart R session here to regain memory resources, to resume you need indo_dt, vertical_bands_links_removed and longer_dfm_simil_df_0.69

# Read in indo_dt df
indo_dt <- readRDS(file = "/home/ubuntu/data/shared_folder/Data/20220719_indo_dt.rds")

# Read in the filtered similarity scores, longer_dfm_simil_df_0.69
longer_dfm_simil_df_0.69 <- readRDS(file = "/home/ubuntu/data/shared_folder/Data/20220719_longer_dfm_simil_df_069.rds")

# Recreate vertical_bands_links_removed
vertical_bands <- indo_dt %>% 
  filter((as_hms("06:54:59") < times & times < hms::as_hms("06:56:00")) | (hms::as_hms("07:59:59") < times & times < hms::as_hms("08:01:00")))

twitter_links <- "(https?)(://t.co/)(.*?)(?=\\\\n|\\s|'|$)"

vertical_bands_links_removed <- vertical_bands %>% mutate(clean_text = str_remove_all(text, twitter_links))

## We now use igraph to generate a set of mutually distinct groups/components of 
## duplicate or near duplicate tweets derived from the pairwise similarity scores.

similars_0.69_igraph <- graph_from_data_frame(longer_dfm_simil_df_0.69, directed = FALSE, vertices = NULL)
similars_0.69_igraph_components <- components(similars_0.69_igraph)

# similars_0.69_igraph_components is a list with three elements: 
# $membership which is a list of each tweet id and the component number igraph
#   has assigned
# $csize which gives the size (ie no of tweets) of each component
# $no - no of components
# We take $membership and add the component numbers as a new column in
# vblr_links_removed

# First, extract the component numbers, initially as a numeric, and then as a
# df with the component numbers as one column - vblr_0.69_jaccard_components - and the 
# tweet ids as a second column - id
vblr_0.69_jaccard_components <- similars_0.69_igraph_components$membership
vblr_0.69_jaccard_components <- as.data.frame(vblr_0.69_jaccard_components)
vblr_0.69_jaccard_components$id <- row.names(vblr_0.69_jaccard_components)

# Now use merge to add the component numbers to vertical_bands_links_removed
# for convenience I'm going to shorten that unwieldy name to vblr
vblr <- merge(x = vertical_bands_links_removed, y = vblr_0.69_jaccard_components, 
              by.x = c("id"), by.y = c("id"), all.x = TRUE, all.Y = FALSE)

# We generated content codes by manually coding the first tweet in each set of 
# duplicates/near duplicates.
# I place the code to create the arranged data object for manual coding here

vblr_sorted_to_print <- vblr %>% arrange(vblr_0.69_jaccard_components) %>% 
  select(id, author_username, times, text, vblr_0.69_jaccard_components)

# Reading in the excel file with the content codes
content_codes <- readxl::read_xlsx(path = "/home/dave/papua-disinfo/vblr_with_0.69_simil_groups_sorted_to_print_coding_copy.xlsx")

# There are several empty tweets (once URLs are removed) that igraph  
# didn't assign to a component - these need to be filtered before merging content codes
# with the dataset.
content_codes <- content_codes %>% select(content_code, vblr_0.69_jaccard_components) %>% 
  filter(!is.na(content_code) & !is.na(vblr_0.69_jaccard_components))

# Final check for the content codes - does any component lack a code?
missed_any <- as.data.frame(c(1:3002))
missed_any <- missed_any %>% filter(!c(1:3002) %in% content_codes$vblr_0.69_jaccard_components)
missed_any

# Does any component have more than one code?
content_codes %>% count(vblr_0.69_jaccard_components, sort = TRUE) %>% head(30)

# Add content codes and size of each component as new columns to vblr
vblr <- merge(x = vblr, y = content_codes, 
                       by.x = c("vblr_0.69_jaccard_components"), 
                       by.y = c("vblr_0.69_jaccard_components"), all.x = TRUE, all.Y = FALSE)

j69_comp_sizes <- vblr %>% group_by(vblr_0.69_jaccard_components) %>% 
  summarise(j69_component_size = n()) %>% ungroup()

vblr <- merge(x = vblr, y = j69_comp_sizes, 
                       by.x = c("vblr_0.69_jaccard_components"), 
                       by.y = c("vblr_0.69_jaccard_components"), all.x = TRUE, all.Y = FALSE)

# Finally, simplify those content codes into genre codes and add that as a new column to vblr
vblr <- vblr %>% mutate(genre_codes = ifelse(content_code %in% c(1,6,11), 1, 
                                      ifelse(content_code %in% c(2,7,12), 2, 
                                      ifelse(content_code %in% c(3,8,13), 3,
                                      ifelse(content_code %in% c(4,9,14), 4,
                                      ifelse(content_code %in% c(5,10,15), 5,
                                      ifelse(content_code == 21, 6,
                                      ifelse(content_code %in% c(16,17), 7,
                                      ifelse(content_code == 28, 8,
                                      9)))))))))

# Generating components with Igraph is also memory-intensive, so a final jump-in  
# point for vblr including the content codes and component sizes
saveRDS(vblr, file = "/home/ubuntu/data/shared_folder/Data/20220719_vblr.rds")

## Read from file
vblr <- readRDS(file = "/home/ubuntu/data/shared_folder/Data/20220719_vblr.rds") %>%
  mutate(month = month(jkt_date_time, label = TRUE),
         vblr_0.69_jaccard_components = fct_explicit_na(factor(vblr_0.69_jaccard_components), "links_only"))

comp_order <- vblr %>% count(vblr_0.69_jaccard_components, sort = TRUE)

vblr$vblr_0.69_jaccard_components <- factor(vblr$vblr_0.69_jaccard_components, 
                                            levels = comp_order$vblr_0.69_jaccard_components)

# For convenience - as the information campaign is to be found most clearly 
# in original tweets mentioning special autonomy, we'll also create a df
# that is just original tweets mentioning otsus in the vertical bands
# The summary stats for Finding 3 are also calculated here 

vblr_orig_otsus <- vblr %>% 
  filter(is_original & otsus) %>%
  mutate(component_size = fct_rev(factor(j69_component_size)))

vblr_orig_otsus %>% nrow() # no of tweets
vblr_orig_otsus %>% distinct(author_id) %>% nrow() # no of distinct authors

vblr_orig_otsus %>% filter(j69_component_size == 1) %>% nrow()
  # to calculate percentage of unique original otsus tweets we first need 
  # the above set of tweets that appear just once = 758

vblr_orig_otsus %>%filter(is.na(vblr_0.69_jaccard_components)) %>% nrow()
  #plus tweets not assigned to a component = 2

# Percentage of unique tweets amongst original 
# otsus tweets in vertical bands
(758 +2)/(vblr_orig_otsus %>% nrow())

# Tally of remainder of original otsus tweets 
vblr_orig_otsus %>% filter(!is.na(vblr_0.69_jaccard_components) & 
                             j69_component_size > 1) %>%
  distinct(vblr_0.69_jaccard_components) %>% nrow()

# Percentages of entire dataset original/all tweets mentioning
# special autonomy for Finding 4
(22479 - 758 - 2)/(indo_dt %>% filter(is_original & otsus) %>% nrow()) #0.1397727
(22479 - 758 - 2)/(indo_dt %>% filter(otsus) %>% nrow()) #0.07896842

# Choose four maximally different coulours not used in other figures, from https://colorbrewer2.org/
cbPalette <- c("#e7298a", "#7570b3", "#d95f02", "#1b9e77")

# A plot showing tweets per day in each vertical bands, illustrating the stark
# shift from one band to the other on 13 April 2021

vblr %>% 
  filter(jkt_date_time >= "2021-02-01") %>% 
  mutate(date_bins = floor_date(jkt_date_time, "day", week_start = getOption("lubridate.week.start", 7)), 
         month=month(date_bins, label = TRUE),
         which_band = ifelse(times < hms::as_hms("06:56:00"), "Time of tweet: 06:55 band", "Time of tweet: 08:00 band")) %>%
  ggplot(aes(date_bins, fill = month)) +
  geom_bar() +
  scale_fill_manual(values=cbPalette) +
  scale_y_continuous(expand = c(0, 0))+
  scale_x_datetime(breaks = ymd_h(c("2021-02-15 12", "2021-03-15 12","2021-04-15 12","2021-05-15 12")), 
                   date_labels = "%B", limits = ymd_h(c("2021-01-31 00", "2021-06-01 24")), expand=c(0,0)) +
  theme(axis.text.y = element_blank(), axis.ticks = element_blank(), panel.grid.major.x = element_blank(), 
        panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())+
  labs(x = "Date tweet posted (2021)", y = "Tweets per day in each vertical band", fill= "") +
  theme_minimal(base_size = 12) +
  facet_grid(which_band~.)

ggsave(file = "figures/2_new_date_vb_posted_plot.png", width=2400, height=2000, units="px")


# Most duplicated tweet in vertical bands
vblr %>% count(vblr_0.69_jaccard_components, sort = TRUE) %>% 
  head(56) #component no 18

# All 286 instances in component 18 are original tweets mentioning otsus
vblr %>% filter(vblr_0.69_jaccard_components == 18 & is_original & otsus) %>% nrow()

# No of distinct authors posting component 18 tweets
vblr %>% filter(vblr_0.69_jaccard_components == 18) %>% 
  distinct(author_id) %>% nrow()

# Plot of dates the 286 duplicates of this tweet in the vertical bands are posted
# using the number of distinct authors posting the tweet per day as the metric
vblr %>%
  filter(vblr_0.69_jaccard_components == 18) %>% 
  mutate(date_bins = floor_date(jkt_date_time, "day", week_start = getOption("lubridate.week.start", 7)), 
         month=month(date_bins, label = TRUE),
         which_band = ifelse(times < hms::as_hms("06:56:00"), "Time of tweet: 06:55 band", "Time of tweet: 08:00 band")) %>%
  group_by(date_bins) %>% distinct(author_username, .keep_all = TRUE) %>% ungroup() %>%  
  ggplot(aes(date_bins, fill = month)) +
  geom_bar() +
  scale_y_continuous(expand = c(0, 0))+
  scale_fill_manual(values=cbPalette[2:4]) +
  scale_x_datetime(breaks = ymd_h(c("2021-02-15 12", "2021-03-15 12","2021-04-15 12","2021-05-15 12")), 
                   date_labels = "%B", limits = ymd_h(c("2021-02-07 00", "2021-05-30 24")), expand=c(0,0)) +
  theme(axis.text.y = element_blank(), axis.ticks = element_blank(), panel.grid.major.x = element_blank(), 
        panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())+
  labs(x = "Date tweet posted (2021)", y = "Distinct twitter users posting most duplicated tweet per day", fill= "") +
  theme_minimal(base_size = 12) +
  facet_grid(which_band~.)

ggsave(file = "figures/3_new_date_authors_post_duplicates_plot.png", width=2400, height=2000, units="px")

# Plot of dates posted for top 12 most duplicated tweets in vertical bands
#again using distinct authors posting per day as metric

custom_labels <- setNames(paste0("n = ", comp_order$n), comp_order$vblr_0.69_jaccard_components)

vblr %>% 
  mutate(month = month(jkt_date_time, label = TRUE), 
         date_bins = floor_date(jkt_date_time, "day", week_start = getOption("lubridate.week.start", 7)),
         which_band = ifelse(times < hms::as_hms("06:56:00"), "Time of tweet: 06:55 band", "Time of tweet: 08:00 band")) %>%
  filter(j69_component_size >= 126) %>%
  group_by(date_bins) %>% distinct(author_username, .keep_all = TRUE) %>% ungroup() %>% 
  ggplot(aes(date_bins, fill = month)) +
  geom_bar() +
  theme_minimal(base_size = 12) +
  scale_fill_manual(values=cbPalette) +
  scale_x_datetime(breaks = ymd_h(c("2021-02-15 12", "2021-03-15 12","2021-04-15 12","2021-05-15 12")), 
                   date_labels = "%B", limits = ymd_h(c("2021-02-07 00", "2021-05-30 24")), expand=c(0,0)) +
  theme(axis.text.y = element_blank(), axis.text.x = element_blank(),  axis.ticks = element_blank(),
        panel.spacing = unit(1, "cm", data = NULL)) +
  labs(x = "Date of posts (2021)", y = "Distinct Twitter users posting per day", fill="") +
  facet_wrap(~vblr_0.69_jaccard_components, labeller = as_labeller(custom_labels))

ggsave(file = "figures/4_new_top_12_plot.png", width=2400, height=2000, units="px")

# Plot of all original tweets in the vertical bands posted by authors of the
# most duplicated tweet. Two stages - first generate a list of authors; 
# then filter to create the plot

most_dupl_286_authors <- vblr %>% filter(j69_component_size == 286) %>% 
  distinct(author_id) %>% pull(author_id)

vblr %>% 
  mutate(month = month(jkt_date_time, label = TRUE), 
         date_bins = floor_date(jkt_date_time, "day", week_start = getOption("lubridate.week.start", 7)),
         which_band = ifelse(times < hms::as_hms("06:56:00"), "Time of tweet: 06:55 band", "Time of tweet: 08:00 band")) %>%
  filter(author_id %in% most_dupl_286_authors & is_original) %>%
  group_by(date_bins) %>% distinct(author_username, .keep_all = TRUE) %>% ungroup() %>% 
  ggplot(aes(date_bins, fill = month)) +
  geom_bar() +
  theme_minimal(base_size = 12) +
  scale_fill_manual(values=cbPalette) +
  scale_x_datetime(breaks = ymd_h(c("2021-02-15 12", "2021-03-15 12","2021-04-15 12","2021-05-15 12")), 
                   date_labels = "%B", limits = ymd_h(c("2021-01-31 00", "2021-06-01 24")), expand=c(0,0)) +
  theme(axis.text.y = element_blank(), axis.ticks = element_blank()) +
  labs(x = "Date of posts (2021)", y = "Distinct Twitter users posting per day", fill="") +
  facet_grid(which_band~.)

ggsave(file = "figures/5_new_all_286_author_tweets_plot.png", width=2400, height=2000, units="px")

# We also take a long characteristic string from component 286 and use this to
# identify the number of times this tweet appears in the entire dataset
indo_dt %>% filter(str_detect(text, 
    "menjadi bukti keseriusan pemerintah dalam meningkatkan kesejahteraan masyarakat di Bumi Cenderawasih")) %>% 
  nrow() #445

## Final task for this script - use Quanteda again to find the top hashtags 
## in the vertical bands
## For simplicity we use the data object names from the Quanteda example:
## https://tutorials.quanteda.io/statistical-analysis/frequency/
## first just for original tweets mentioning special autonomy
toks_tweets <- tokens(vblr_orig_otsus$clean_text, remove_punct = TRUE) %>% 
  tokens_keep(pattern = "#*")
dfmat_tweets <- dfm(toks_tweets)
tstat_freq <- textstat_frequency(dfmat_tweets, n = NULL, groups = vblr_orig_otsus$otsus)
head(tstat_freq, 50)

# Hashtags in the non-original otsus portion
vblr_the_rest_of_the_tweets <- vblr %>% filter(!id %in% vblr_orig_otsus$id)
toks_tweets1 <- tokens(vblr_the_rest_of_the_tweets$clean_text, remove_punct = TRUE) %>% 
  tokens_keep(pattern = "#*")
dfmat_tweets1 <- dfm(toks_tweets1)
tstat_freq1 <- textstat_frequency(dfmat_tweets1, n = NULL, groups = vblr_the_rest_of_the_tweets$otsus)
head(tstat_freq1, 50)

## Analysis continues in '5_vblr_topics_and_author_stats.R'
